# In this analysis we will: run t-SNE on the cleaned cluster data, then compare k-means and hierarchical clusterings of the 2-D embedding.
# 'from __future__' imports must appear before any other statement, otherwise
# Python raises a SyntaxError at compile time -- it was originally at the end
# of this import block.
from __future__ import print_function

import copy
import time
import warnings
from datetime import datetime
from time import strftime

# silence deprecation chatter before the heavyweight third-party imports
warnings.simplefilter('ignore', DeprecationWarning)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
#%matplotlib inline
#import hdbscan
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
#from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn import metrics
from sklearn import metrics as mt
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import confusion_matrix as conf
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.cluster import KMeans
from tabulate import tabulate
from IPython.core.interactiveshell import InteractiveShell

# show every expression result in a notebook cell, not just the last one
InteractiveShell.ast_node_interactivity = "all"
# Load the cleaned clustering dataset produced by an earlier prep step.
df = pd.read_csv('cluster_clean_data.csv')
# quick sanity check of columns and first rows (notebook-style bare expressions)
df.columns
df.head()
# set required variables for model comparison: one row per t-SNE run
tsne_tbl = pd.DataFrame(columns = [
    'model_name',
    'perplexity',
    'kl_divergence',
    'process_time'])
# row counter for the comparison table (the original 'i_index = []' was dead
# code, immediately overwritten by the 0)
i_index = 0
# preparation for cross validation and model comparison, each classifier is appended once model is fit
models = []
# Work on a deep copy so the raw frame is untouched by the cluster prep.
df_cluster = copy.deepcopy(df)
df_cluster.head()
# set_index returns a NEW frame (it is not in-place): the original code
# discarded the result three times, leaving the default RangeIndex in place.
# Assign it once so the sample below carries 'id' values in its index.
df_cluster = df_cluster.set_index('id')
df_cluster.isnull().any()
# one-hot encode the categorical columns
df_cluster = pd.get_dummies(df_cluster)
df_cluster.head()
df_cluster.columns
# 35% random sample keeps the t-SNE runtime manageable; with 'id' as the
# index, the sampled rows keep their ids for the later re-join against df
X1 = df_cluster.sample(frac = 0.35)
# Remember which original rows were sampled so the t-SNE coordinates can be
# re-joined onto the full dataframe later.
X1_index = X1.index.to_list()
len(X1_index)
# output frame: one (x, y) t-SNE coordinate pair per sampled row
df_out = pd.DataFrame(columns = ['x-tsne', 'y-tsne'])
df_out['sample_index'] = X1_index
from sklearn.manifold import TSNE

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... t-sne loop - evaluate a range of perplexity values; each iteration
# ... plots the 2-D embedding and writes the coordinates out as .csv
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
for perplex in [20, 50, 75, 200, 400, 500]:
    i_index = i_index + 1
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    tic = time.perf_counter()
    tsne = TSNE(n_components = 2, verbose = 1, perplexity = perplex, n_iter = 300)
    tsne_results = tsne.fit_transform(X1)
    tsne_kl_diverge = tsne.kl_divergence_
    toc = time.perf_counter()
    print(toc - tic)
    # ... save statistics for model comparison
    exe_time = '{0:.4f}'.format(toc - tic)
    raw_data = {
        'model_name'   : 't-SNE - 2D',
        'perplexity'   : perplex,
        'kl_divergence': tsne_kl_diverge,
        'process_time' : exe_time
    }
    df_tbl = pd.DataFrame(raw_data,
                          columns = ['model_name',
                                     'perplexity',
                                     'kl_divergence',
                                     'process_time'],
                          index = [i_index])
    # DataFrame.append was removed in pandas 2.0; concat is the supported API
    tsne_tbl = pd.concat([tsne_tbl, df_tbl])
    # ... plot the embedding for this perplexity
    _ = plt.figure(figsize = (12, 8))
    _ = plt.subplot(111, facecolor = 'lightgrey')
    _ = plt.scatter(tsne_results[:, 0], tsne_results[:, 1],
                    s = 40,
                    c = 'darkorchid',
                    linewidths = 0,
                    alpha = 0.30)
    _ = plt.xlabel('t-SNE axis 1')
    _ = plt.ylabel('t-SNE axis 2')
    _ = plt.suptitle("t-SNE 2-D Mapping - Perplexity = %d" %perplex)
    _ = plt.title("KL Divergence = %.2f" %tsne_kl_diverge)
    _ = plt.grid(True)
    data_dir = 'C:/Users/Preeti/Github/CapstoneProject/unsupervised/'
    data_file_base = "t_sne_mapping_perplex_"
    data_file_num = "%04d" %perplex
    #data_file_time = datetime.now().strftime("%Y%0m%0d_%H%M%S")
    data_file_time = '18.10.19'
    data_file_ext = ".png"
    plt_file_2_save = data_dir + data_file_base + data_file_num + data_file_time + data_file_ext
    plt.savefig(plt_file_2_save)
    plt.show()
    # ... write the per-perplexity coordinates as .csv for future recall
    df_out['x-tsne'] = tsne_results[:, 0]
    df_out['y-tsne'] = tsne_results[:, 1]
    # same directory/base/date as the .png above; only number format and
    # extension differ
    data_file_num = "_%04d_" %perplex
    data_file_ext = ".csv"
    file_2_write = data_dir + data_file_base + data_file_num + data_file_time + data_file_ext
    print(file_2_write)
    df_out.to_csv(file_2_write, index = False)
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# ... end of for loop on t-sne perplex
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... read back the t-SNE vectors saved by the perplexity = 200 run
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
df_tsne = pd.read_csv('C:/Users/Preeti/Github/CapstoneProject/unsupervised/t_sne_mapping_perplex__0200_18.10.19.csv')
df_tsne.head()
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... join t-sne vectors with base data, since we sampled to create
# ... the t-sne mapping
# NOTE(review): 'sample_index' holds the sampled frame's index values while
# df is re-indexed by 'id' here -- this join is only correct if those two
# coincide; confirm against the sampling step above.
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
df_join = df_tsne.set_index('sample_index').join(df.set_index('id'))
# fills gaps with the *string* '0', mixing str into numeric columns;
# downstream code re-casts the numeric ones with astype(float)
df_join.fillna('0',inplace=True)
df_join.head()
col_names = df_join.columns.values.tolist()
col_names
df_join.describe().T
# set required variables for model comparison: one row per k-means run
kmeans_tbl = pd.DataFrame(columns = [
    'model_name',
    'n_clusters',
    'inertia',
    'silhouette',
    'process_time'])
# row counter (the original 'i_index = []' was dead code, immediately
# overwritten by the 0)
i_index = 0
# preparation for cross validation and model comparison, each classifier is appended once model is fit
models = []
# ... k-means sweep on the 2-D t-sne vectors, k = 2 .. 24
X_tsne = pd.DataFrame(columns=['t1', 't2'])
X_tsne['t1'] = df_join['x-tsne']
X_tsne['t2'] = df_join['y-tsne']
for n_lda in range(2, 25):
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    tic = time.perf_counter()
    print("n_lda = ", n_lda)
    cls_lda = KMeans(n_clusters = n_lda,
                     init = 'k-means++',
                     random_state = 1)
    # fit on the two t-SNE coordinates only -- the per-k label columns
    # appended to X_tsne below must not leak into later iterations' features
    cls_lda.fit(X_tsne[['t1', 't2']])
    kmeans_labels = cls_lda.labels_          # the labels from kmeans clustering
    kmeans_centers = cls_lda.cluster_centers_
    kmeans_inertia = cls_lda.inertia_
    print("inertia = ", kmeans_inertia)
    # silhouette on a 10k random sample keeps the computation tractable
    kmeans_silhouette = metrics.silhouette_score(X_tsne[['t1', 't2']],
                                                 kmeans_labels,
                                                 metric = 'euclidean',
                                                 sample_size = 10000)
    print("silhouette = ", kmeans_silhouette)
    column_name = "kmeans_" + str(n_lda)
    df_join[column_name] = kmeans_labels
    toc = time.perf_counter()
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    # ... save statistics for model comparison
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    exe_time = '{0:.4f}'.format(toc - tic)
    raw_data = {
        'model_name'  : 'KMeans - LDA features',
        'n_clusters'  : n_lda,
        'inertia'     : kmeans_inertia,
        'silhouette'  : kmeans_silhouette,
        'process_time': exe_time
    }
    df_tbl = pd.DataFrame(raw_data,
                          columns = ['model_name', 'n_clusters', 'inertia', 'silhouette', 'process_time'],
                          index = [i_index + 1])
    # DataFrame.append was removed in pandas 2.0; concat is the supported API
    kmeans_tbl = pd.concat([kmeans_tbl, df_tbl], ignore_index=True)
    column_name = "kmeans_" + str(n_lda)
    X_tsne[column_name] = kmeans_labels
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    # ... - make some plots of clusters
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    _ = plt.figure(figsize=(12, 8))
    _ = plt.subplot(111, facecolor = 'darkgrey')
    X_tsne_values = X_tsne.values
    _ = plt.scatter(X_tsne_values[:, 0], X_tsne_values[:, 1],
                    c = kmeans_labels,
                    cmap = plt.cm.Paired,
                    s = 50,
                    linewidths = 0,
                    alpha = 0.20)
    _ = plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 1],
                    c = range(n_lda),
                    cmap = plt.cm.Paired,
                    s = 400,
                    linewidths = 1.0,
                    marker = '^',
                    edgecolors = 'black',
                    alpha = 0.90)
    # label each center with its cluster number
    for ii in range(n_lda):
        _ = plt.text(kmeans_centers[ii, 0], kmeans_centers[ii, 1], ii, fontsize = 40)
        print(ii, kmeans_centers[ii, 0], kmeans_centers[ii, 1], ii)
    _ = plt.xlabel('t-SNE axis 1')
    _ = plt.ylabel('t-SNE axis 2')
    _ = plt.grid(True)
    plt.show()
kmeans_tbl
## Run k-means once more at the finalized cluster count (k = 8) to recreate
## the centers and labels used by the profiling plots below.
n_lda = 8
# ... k-means on the t-sne vectors (rebuilt fresh: coordinates only)
X_tsne = pd.DataFrame(columns=['t1', 't2'])
X_tsne['t1'] = df_join['x-tsne']
X_tsne['t2'] = df_join['y-tsne']
print ("n_lda = ", n_lda)
cls_lda = KMeans(n_clusters = n_lda,
init = 'k-means++',
random_state = 1);
cls_lda.fit(X_tsne)
kmeans_labels = cls_lda.labels_ # the labels from kmeans clustering
kmeans_centers = cls_lda.cluster_centers_
kmeans_inertia = cls_lda.inertia_
print ("inertia = ", kmeans_inertia)
# silhouette on a 10k random sample keeps the computation tractable
kmeans_silhouette = metrics.silhouette_score(X_tsne,
kmeans_labels,
metric = 'euclidean',
sample_size = 10000)
print ("silhouette = ", kmeans_silhouette)
#column_name = "kmeans_" + str(n_lda)
#df_join[column_name] = kmeans_labels
df_join.head()
df_join.head()
# carry everything (features + t-SNE coords + k-means labels) in one frame
X_all_together = copy.deepcopy(df_join)
len(X_all_together)
# 'kmeans_8' was written onto df_join during the sweep above
X_all_together['kmeans_labels'] = df_join['kmeans_8']
# NOTE(review): fills gaps with the *string* '0'; numeric columns are
# re-cast to float further down
X_all_together.fillna('0',inplace=True)
X_all_together.isnull().sum()
X_all_together.to_csv("df_model_with_kmeans.csv", index=False)
col_names = X_all_together.columns.values.tolist()
col_names
# per-k label columns from the sweep (kmeans_2 .. kmeans_24)
col_to_remove = ['kmeans_%d' % k for k in range(2, 25)]
# drop them from the working column list; a plain list.remove would raise
# ValueError if any column were missing, so filter instead
col_names = [c for c in col_names if c not in col_to_remove]
col_names
X_all_together.select_dtypes(include=['number']).columns
# numeric features to profile per cluster below
col_to_convert_to_float = ['loan_amnt', 'annual_inc', 'dti', 'delinq_2yrs',
                           'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
                           'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
                           'pub_rec_bankruptcies', 'n_term']
# cast back to float (the earlier fillna('0') turned these into object dtype)
for col in col_to_convert_to_float:
    X_all_together[col] = X_all_together[col].astype(float).fillna(0.0)
# weights table: mean feature value per k-means cluster, one row per feature
kmeans_weight_tbl = pd.DataFrame(columns = [
    'Column Name',
    'cluster 0',
    'cluster 1',
    'cluster 2',
    'cluster 3',
    'cluster 4',
    'cluster 5',
    'cluster 6',
    'cluster 7'])
# row counter (the original 'i_index = []' was dead code, immediately
# overwritten by the 0)
i_index = 0
# For each numeric feature: (1) color the t-SNE map by the feature value,
# (2) boxplot the feature per k-means cluster, (3) show the cluster map,
# and record the per-cluster means in kmeans_weight_tbl.
for col in col_to_convert_to_float:
    _ = plt.figure(figsize=(24, 8))
    print('**************************', col)
    # ... feature distribution color map
    _ = plt.subplot(131, facecolor = 'darkgrey')
    _ = plt.scatter(X_all_together['x-tsne'], X_all_together['y-tsne'],
                    c = X_all_together[col],
                    cmap = plt.cm.Spectral,
                    s = 50,
                    linewidths = 0,
                    alpha = 0.30)
    _ = plt.title(col)
    # ... feature boxplots per cluster
    _ = plt.subplot(132, facecolor = 'darkgrey')
    ax = sns.boxplot(x = "kmeans_labels", y = col, data = X_all_together)
    average_values = X_all_together.groupby(['kmeans_labels'])[col].mean().values
    average_labels = [str(np.round(s, 2)) for s in average_values]
    raw_data = {
        'Column Name': col,
        'cluster 0': average_values[0],
        'cluster 1': average_values[1],
        'cluster 2': average_values[2],
        'cluster 3': average_values[3],
        'cluster 4': average_values[4],
        'cluster 5': average_values[5],
        'cluster 6': average_values[6],
        'cluster 7': average_values[7]
    }
    df_tbl = pd.DataFrame(raw_data,
                          columns = ['Column Name','cluster 0','cluster 1','cluster 2','cluster 3','cluster 4','cluster 5','cluster 6','cluster 7'],
                          index = [i_index + 1])
    # DataFrame.append was removed in pandas 2.0; concat is the supported API
    kmeans_weight_tbl = pd.concat([kmeans_weight_tbl, df_tbl])
    print("****************************************")
    # annotate each box with its cluster's mean value
    pos = range(len(average_values))
    for tick, label in zip(pos, ax.get_xticklabels()):
        _ = ax.text(pos[tick], average_values[tick], average_labels[tick],
                    horizontalalignment = 'center', size = 'small', color = 'w', weight = 'semibold')
    # ... cluster color map
    _ = plt.subplot(133, facecolor = 'darkgrey')
    _ = plt.scatter(X_all_together['x-tsne'], X_all_together['y-tsne'],
                    c = kmeans_labels,
                    cmap = plt.cm.tab20,
                    s = 50,
                    linewidths = 0,
                    alpha = 0.30)
    _ = plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 1],
                    c = range(n_lda),
                    cmap = plt.cm.tab20b,
                    s = 200,
                    linewidths = 1.0,
                    marker = '^',
                    edgecolors = 'black',
                    alpha = 0.50)
    for ii in range(n_lda):
        _ = plt.text(kmeans_centers[ii, 0], kmeans_centers[ii, 1], ii, fontsize = 20)
    _ = plt.xlabel('t-SNE axis 1')
    _ = plt.ylabel('t-SNE axis 2')
    _ = plt.title('t-SNE 2-D mapping')
    data_dir = 'C:/Users/Preeti/Github/CapstoneProject/unsupervised/plots/'
    data_file_base = "Kmeans_comparison_"
    data_file_num = str(col)
    #data_file_time = datetime.now().strftime("%Y%0m%0d_%H%M%S")
    data_file_time = '18.10.19'
    data_file_ext = ".png"
    plt_file_2_save = data_dir + data_file_base + data_file_num + data_file_time + data_file_ext
    plt.savefig(plt_file_2_save)
    _ = plt.show()
kmeans_weight_tbl.reset_index(inplace=True)
kmeans_weight_tbl
# Express each cluster's mean as a percentage of the row total across the
# eight clusters. The original bound this total to the name 'sum', shadowing
# the builtin; skipna=False keeps the same NaN-propagating behaviour as the
# original '+' chain.
cluster_cols = ['cluster %d' % k for k in range(8)]
row_total = kmeans_weight_tbl[cluster_cols].sum(axis=1, skipna=False)
for k in range(8):
    kmeans_weight_tbl['Cluster #%d' % k] = (kmeans_weight_tbl['cluster %d' % k] / row_total) * 100
print(row_total)
kmeans_weight_tbl
kmeans_cols = ['Cluster #0', 'Cluster #1', 'Cluster #2', 'Cluster #3', 'Cluster #4', 'Cluster #5', 'Cluster #6', 'Cluster #7']
kmeans_cols
sns.set_style('whitegrid')
sns.set(rc={'figure.figsize':(13,9)})
# one horizontal bar chart per cluster: each feature's share of that cluster
for col in kmeans_cols:
    sns.barplot(y="Column Name", x=col, data=kmeans_weight_tbl, palette='Set2')
    data_dir = 'C:/Users/Preeti/Github/CapstoneProject/unsupervised/plots/'
    data_file_base = "Kmeans_cluster_"
    data_file_num = str(col)
    #data_file_time = datetime.now().strftime("%Y%0m%0d_%H%M%S")
    data_file_time = '18.10.19'
    data_file_ext = ".png"
    plt_file_2_save = data_dir + data_file_base + data_file_num + data_file_time + data_file_ext
    plt.savefig(plt_file_2_save)
    plt.show()
# set required variables for model comparison: one row per hierarchical run
comparison_tbl = pd.DataFrame(columns = [
    'model_name',
    'n_clusters',
    'inertia',
    'silhouette',
    'process_time'])
# row counter (the original 'i_index = []' was dead code, immediately
# overwritten by the 0)
i_index = 0
# preparation for cross validation and model comparison, each classifier is appended once model is fit
models = []
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
# Using the dendrograms to pick candidate cluster counts per linkage type.
# NOTE: the original bound the plot result to the name 'dendrogram', shadowing
# the function imported from scipy at the top; the (unused) return value is
# discarded instead.
_ = sch.dendrogram(sch.linkage(X_tsne, method = 'complete'))
_ = plt.title('Dendrogram, type=complete')
_ = plt.xlabel('X_tsne')
_ = plt.ylabel('Euclidean distances')
plt.savefig("C:/Users/Preeti/Github/CapstoneProject/unsupervised/plots/hierarchical/dendrogram_complete.png")
_ = plt.show()
# ... same, with average linkage
_ = sch.dendrogram(sch.linkage(X_tsne, method = 'average'))
_ = plt.title('Dendrogram, type=average')
_ = plt.xlabel('X_tsne')
_ = plt.ylabel('Euclidean distances')
plt.savefig("C:/Users/Preeti/Github/CapstoneProject/unsupervised/plots/hierarchical/dendrogram_average.png")
_ = plt.show()
# ... same, with ward linkage
_ = sch.dendrogram(sch.linkage(X_tsne, method = 'ward'))
_ = plt.title('Dendrogram, type=ward')
_ = plt.xlabel('X_tsne')
_ = plt.ylabel('Euclidean distances')
plt.savefig("C:/Users/Preeti/Github/CapstoneProject/unsupervised/plots/hierarchical/dendrogram_ward.png")
_ = plt.show()
# cluster counts read off the dendrograms above
n_optimal_cluster_ward = 3
n_optimal_cluster_average = 5
n_optimal_cluster_complete = 4
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... optimal model for linkage = ward
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# time.clock() was removed in Python 3.8; perf_counter() is the replacement
tic = time.perf_counter()
hc_ward = AgglomerativeClustering(n_clusters = n_optimal_cluster_ward, affinity = 'euclidean', linkage = 'ward')
hc_ward.fit(X_tsne)
hc_labels_ward = hc_ward.labels_ # the labels from hierarchical clustering
# AgglomerativeClustering exposes no centers / inertia, hence 'NA' below
hc_silhouette = metrics.silhouette_score(X_tsne,
                                         hc_labels_ward,
                                         metric = 'euclidean',
                                         sample_size = 10000)
print("silhouette = ", hc_silhouette)
toc = time.perf_counter()
# ... save statistics for model comparison
exe_time = '{0:.4f}'.format(toc - tic)
raw_data = {
    'model_name'  : 'hierarchical - ward',
    'n_clusters'  : n_optimal_cluster_ward,
    'inertia'     : 'NA',
    'silhouette'  : hc_silhouette,
    'process_time': exe_time
}
df_tbl = pd.DataFrame(raw_data,
                      columns = ['model_name', 'n_clusters', 'inertia', 'silhouette', 'process_time'],
                      index = [i_index + 1])
# DataFrame.append was removed in pandas 2.0; concat is the supported API
comparison_tbl = pd.concat([comparison_tbl, df_tbl])
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... optimal model for linkage = complete
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# time.clock() was removed in Python 3.8; perf_counter() is the replacement
tic = time.perf_counter()
hc_complete = AgglomerativeClustering(n_clusters = n_optimal_cluster_complete, affinity = 'euclidean', linkage = 'complete')
hc_complete.fit(X_tsne)
hc_labels_complete = hc_complete.labels_ # the labels from hierarchical clustering
# AgglomerativeClustering exposes no centers / inertia, hence 'NA' below
hc_silhouette = metrics.silhouette_score(X_tsne,
                                         hc_labels_complete,
                                         metric = 'euclidean',
                                         sample_size = 10000)
print("silhouette = ", hc_silhouette)
toc = time.perf_counter()
# ... save statistics for model comparison
exe_time = '{0:.4f}'.format(toc - tic)
raw_data = {
    'model_name'  : 'hierarchical - complete',
    'n_clusters'  : n_optimal_cluster_complete,
    'inertia'     : 'NA',
    'silhouette'  : hc_silhouette,
    'process_time': exe_time
}
df_tbl = pd.DataFrame(raw_data,
                      columns = ['model_name', 'n_clusters', 'inertia', 'silhouette', 'process_time'],
                      index = [i_index + 1])
# DataFrame.append was removed in pandas 2.0; concat is the supported API
comparison_tbl = pd.concat([comparison_tbl, df_tbl])
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... optimal model for linkage = average
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# time.clock() was removed in Python 3.8; perf_counter() is the replacement
tic = time.perf_counter()
hc_average = AgglomerativeClustering(n_clusters = n_optimal_cluster_average, affinity = 'euclidean', linkage = 'average')
hc_average.fit(X_tsne)
hc_labels_average = hc_average.labels_ # the labels from hierarchical clustering
# AgglomerativeClustering exposes no centers / inertia, hence 'NA' below
hc_silhouette = metrics.silhouette_score(X_tsne,
                                         hc_labels_average,
                                         metric = 'euclidean',
                                         sample_size = 10000)
print("silhouette = ", hc_silhouette)
toc = time.perf_counter()
# ... save statistics for model comparison
exe_time = '{0:.4f}'.format(toc - tic)
raw_data = {
    'model_name'  : 'hierarchical - average',
    'n_clusters'  : n_optimal_cluster_average,
    'inertia'     : 'NA',
    'silhouette'  : hc_silhouette,
    'process_time': exe_time
}
df_tbl = pd.DataFrame(raw_data,
                      columns = ['model_name', 'n_clusters', 'inertia', 'silhouette', 'process_time'],
                      index = [i_index + 1])
# DataFrame.append was removed in pandas 2.0; concat is the supported API
comparison_tbl = pd.concat([comparison_tbl, df_tbl])
comparison_tbl = comparison_tbl.reset_index(drop=True)
comparison_tbl['process_time'] = pd.to_numeric(comparison_tbl['process_time'])
comparison_tbl
sns.set_context('poster')
sns.set_color_codes()
# shared scatter styling, reused by the average/complete plots below
plot_kwds = dict(alpha=0.25, s=80, linewidths=0)
palette = sns.color_palette('deep', np.unique(hc_labels_ward).max() + 1)
# negative labels (none expected from AgglomerativeClustering) fall back to black
colors = [(0.0, 0.0, 0.0) if lbl < 0 else palette[lbl] for lbl in hc_labels_ward]
X_tsne_values = X_tsne.values
_ = plt.scatter(X_tsne_values[:, 0], X_tsne_values[:, 1], c=colors, **plot_kwds)
frame = plt.gca()
frame.axes.get_xaxis().set_visible(False)
frame.axes.get_yaxis().set_visible(False)
_ = plt.title('Hierarchical Clusters, type = ward ', fontsize=16)
_ = plt.xlabel("X_tsne")
_ = plt.ylabel("y_tsne")
plt.savefig("C:/Users/Preeti/Github/CapstoneProject/unsupervised/plots/hierarchical/hc_ward_eval.png")
_ = plt.show()
# same styling as the ward plot, now for the average-linkage labels
palette = sns.color_palette('deep', np.unique(hc_labels_average).max() + 1)
colors = [(0.0, 0.0, 0.0) if lbl < 0 else palette[lbl] for lbl in hc_labels_average]
X_tsne_values = X_tsne.values
_ = plt.scatter(X_tsne_values[:, 0], X_tsne_values[:, 1], c=colors, **plot_kwds)
frame = plt.gca()
frame.axes.get_xaxis().set_visible(False)
frame.axes.get_yaxis().set_visible(False)
_ = plt.title('Hierarchical Clusters, type = average', fontsize=16)
plt.savefig("C:/Users/Preeti/Github/CapstoneProject/unsupervised/plots/hierarchical/hc_average_eval.png")
_ = plt.show()
# same styling again, for the complete-linkage labels
palette = sns.color_palette('deep', np.unique(hc_labels_complete).max() + 1)
colors = [(0.0, 0.0, 0.0) if lbl < 0 else palette[lbl] for lbl in hc_labels_complete]
X_tsne_values = X_tsne.values
_ = plt.scatter(X_tsne_values[:, 0], X_tsne_values[:, 1], c=colors, **plot_kwds)
frame = plt.gca()
frame.axes.get_xaxis().set_visible(False)
frame.axes.get_yaxis().set_visible(False)
_ = plt.title('Hierarchical Clusters, type = complete', fontsize=16)
plt.savefig("C:/Users/Preeti/Github/CapstoneProject/unsupervised/plots/hierarchical/hc_complete_eval.png")
_ = plt.show()
# attach the average-linkage cluster label to the joined feature frame
X_all_together['hierarchical'] = hc_labels_average
hc_labels_average
# per-cluster centroids in t-SNE space (mean x / mean y of each cluster)
hc_average_centers_x = X_all_together.groupby(['hierarchical'])['x-tsne'].mean()
hc_average_centers_y= X_all_together.groupby(['hierarchical'])['y-tsne'].mean()
hc_average_centers_x
hc_average_centers_y
# number of clusters = highest label + 1 (labels are 0-based)
n_lda = np.unique(hc_labels_average).max() +1
n_lda
#### feature importance for the average-linkage clustering
X_HC_average_analysis = copy.deepcopy(X_all_together)
# drop the embedding coordinates before profiling the clusters
del X_HC_average_analysis['x-tsne']
del X_HC_average_analysis['y-tsne']
#del X_HC_average_analysis['sample_index']
grouped = X_HC_average_analysis.groupby(['hierarchical'])
#print(grouped)
# z-score of each cluster's mean against the overall mean/std -- a rough
# per-feature "importance" per cluster. NOTE(review): assumes the remaining
# columns are numeric; confirm after the earlier fillna('0') / casts.
X_HC_average_analysis = (grouped.mean() - X_HC_average_analysis.mean()) / X_HC_average_analysis.std()
#HC_Complete_analysis_tbl
# boxplot across clusters for each feature ...
import seaborn as sns
sns.set_palette("husl")
plt.style.use('classic')
col_names = X_all_together.columns.values.tolist()
# NOTE(review): this iterates over *every* column of X_all_together,
# including any non-numeric ones -- scatter(c=...) and boxplot assume
# numeric values; confirm the frame is fully numeric at this point.
for col in col_names:
    print("************************* ", col)
    _ = plt.figure(figsize=(24, 8))
    # ... feature distribution color map
    _ = plt.subplot(131, facecolor = 'darkgrey')
    _ = plt.scatter(X_all_together['x-tsne'], X_all_together['y-tsne'],
                    c = X_all_together[col],
                    cmap = plt.cm.Spectral,
                    s = 50,
                    linewidths = 0,
                    alpha = 0.30)
    _ = plt.title(col)
    # ... feature boxplots per hierarchical cluster
    _ = plt.subplot(132, facecolor = 'darkgrey')
    ax = sns.boxplot(x = "hierarchical", y = col, data = X_all_together)
    _ = plt.xlabel("hc_labels_average")
    _ = plt.title(col)
    # annotate each box with its cluster's mean value
    average_values = X_all_together.groupby(['hierarchical'])[col].mean().values
    average_labels = [str(np.round(s, 2)) for s in average_values]
    pos = range(len(average_values))
    for tick, label in zip(pos, ax.get_xticklabels()):
        _ = ax.text(pos[tick], average_values[tick], average_labels[tick],
                    horizontalalignment = 'center', size = 'small', color = 'w', weight = 'semibold')
    # ... cluster color map
    _ = plt.subplot(133, facecolor = 'darkgrey')
    _ = plt.scatter(X_all_together['x-tsne'], X_all_together['y-tsne'],
                    c = hc_labels_average,
                    cmap = plt.cm.tab20,
                    s = 50,
                    linewidths = 0,
                    alpha = 0.30)
    for ii in range(n_lda):
        _ = plt.text(hc_average_centers_x[ii], hc_average_centers_y[ii], ii, fontsize = 20)
    _ = plt.xlabel('t-SNE axis 1')
    _ = plt.ylabel('t-SNE axis 2')
    _ = plt.title('t-SNE 2-D mapping')
    _ = plt.title(col)
    plt.savefig("C:/Users/Preeti/Github/CapstoneProject/unsupervised/plots/hierarchical/HC_average_%s.png" %col)
    _ = plt.show()